MacTech 1 to 12

home *** CD-ROM | disk | FTP | other *** search

/ MacTech 1 to 12 / MacTech-vol-1-12.toast / Source / MacTech® Magazine / Volume 12 - 1996 / 12.02 Feb 96 / 12.02 Challenge / find.c < prev

Wrap

C/C++ Source or Header | 1995-12-11 | 20.5 KB | 786 lines | [TEXT/R*ch]

/* Copyright © 1995 Gustav Larsson */ /* ------------------------------------------------------ */ /* Constants & Types */ #define ALPHABET_SIZE 256 #define ALLOC_SIZE(n) ((n+3) & -4L) /* next multiple of 4 */ #define HASH_BUCKETS 1024 /* must be power of 2 */ #define HASH_MASK (HASH_BUCKETS - 1) #define NO_NULL_CHAR 'A' #define NULL 0 //#define TRUE 1 //#define FALSE 0 typedef unsigned char uchar; typedef unsigned short ushort; typedef unsigned long ulong; typedef struct Word Word; typedef struct Occurrence Occurrence; typedef struct Private Private; /* A block of occurrence positions. We pack in as many occurrences as possible into a single block, from 3 to 6 depending on textLength. The first entry in the block is always used. The remaining entries are in use if they are not zero. These facts are used several places to simplify the code. */ struct Occurrence { Occurrence *next; union { ushort pos2[6]; /* 2 bytes/occurrence */ struct { ushort lo[4]; uchar hi[4]; } pos3; /* 3 bytes/occurrence */ long pos4[3]; /* 4 bytes/occurrence */ } p; }; /* There is one Word struct for each distince word. The word's length is stored in the top eight bits of the hash value. There's no need to store the characters in the word since we can just look at the first occurrence (first entry in Word.first). */ struct Word { Word *next; ulong hash; Occurrence *last; Occurrence first; }; /* The structure of our private storage. The hashCodes[] array serves two purposes: it distinguishes alphanumeric from non-alphanumeric characters, and it provides a non-zero hash code for each alphanumeric character. The endParsedText field will be -1 if there was enough private memory to parse all the text. Otherwise, it points to the start of the unparsed text. nullChar is used by the BMH_Search() function when we must search unparsed text for an occurrence. */ struct Private { ulong hashCodes [ ALPHABET_SIZE ]; Word *hashTable [ HASH_BUCKETS ]; long endParsedText; /* start of parsed text */ long posBytes; /* POS_x_BYTES, below */ char nullChar; /* char not appearing in the text */ long heap; /* start of private heap */ }; /* ------------------------------------------------------ */ /* Macros */ /* These macros simplify access to the occurrence positions stored in an Occurrence struct. Posbytes is a macro argument that is usually set to private->posBytes. However, you can also use a constant for posbytes, which lets the compiler choose the right case at compile time, producing smaller and faster code. */ #define POS_2_BYTES 1 /* word position fits in 2 bytes */ #define POS_3_BYTES 0 /* fits in 3 bytes; usual case */ #define POS_4_BYTES 2 /* fits in 4 bytes */ #define GET_POS(pos,occur,index,posbytes) \ { \ if ( (posbytes) == POS_3_BYTES ) \ pos = ((long)(occur)->p.pos3.hi[index] << 16) \ + (occur)->p.pos3.lo[index]; \ else if ( (posbytes) == POS_2_BYTES ) \ pos = (occur)->p.pos2[index]; \ else \ pos = (occur)->p.pos4[index]; \ } #define SET_POS(pos,occur,index,posbytes) \ { \ if ( (posbytes) == POS_3_BYTES ) \ { \ (occur)->p.pos3.hi[index] = (pos) >> 16; \ (occur)->p.pos3.lo[index] = (pos); \ } \ else if ( (posbytes) == POS_2_BYTES ) \ (occur)->p.pos2[index] = pos; \ else \ (occur)->p.pos4[index] = pos; \ } /* ------------------------------------------------------ */ /* Prototypes */ void InitFind ( char *textToSearch, long textLength, void *privateStorage, long storageSize ); long FindWordOccurrence ( char *wordToFind, long wordLength, long occurrenceToFind, char *textToSearch, long textLength, void *privateStorage, long storageSize ); static long InitFindBody ( uchar *textToSearch, long textLength, Private *private, uchar *endPrivateStorage ); static Word *LookupWord ( Private *private, char *textToSearch, char *wordText, long wordLength, ulong hash ); static char PickNullChar ( Private *private, uchar *textStart, /* start of unparsed text */ uchar *textEnd /* end of all text */ ); static char *BMH_Search( ulong *hashCodes, /* private->hashCodes */ char *wordToFind, long wordLength, long occurrenceToFind, /* 0 is 1st occurrence */ char *textStart, /* start of unparsed text */ char *textEnd, /* end of all text */ char nullChar /* private->nullChar */ ); static char *SimpleSearch( ulong *hashCodes, /* private->hashCodes */ char *wordToFind, long wordLength, /* 1..3 */ long occurrenceToFind, /* 0 is 1st occurrence */ char *textStart, /* start of unparsed text */ char *textEnd /* end of all text */ ); /* ------------------------------------------------------ */ /* InitFind */ void InitFind ( char *textToSearch, long textLength, void *privateStorage, long storageSize ) { Private *private = privateStorage; private->endParsedText = InitFindBody( (uchar *)textToSearch, textLength, privateStorage, (uchar *)privateStorage + storageSize ); if ( private->endParsedText != -1 ) private->nullChar = PickNullChar( private, (uchar *)textToSearch + private->endParsedText, (uchar *)textToSearch + textLength ); else private->nullChar = NO_NULL_CHAR; } /* ------------------------------------------------------ */ /* InitFindBody */ /* This function does most of the work for InitFind(). The arguments have been recast into a more useful form; uchar and ulong are used a lot so that we don't have to worry about the sign, especially when indexing private->hashCodes[]. The return value is the character position when the unparsed text begins (if we run out of private storage), or -1 if all the text was parsed. */ static long InitFindBody ( uchar *textToSearch, long textLength, Private *private, uchar *endPrivateStorage ) { uchar *alloc, *textPos, *textEnd, *wordStart; long wordLength; ulong hash, code; Word *word; Occurrence *occur; /* Init table of hash codes. The remaining entries are guaranteed to be initialized to zero. The hash codes were chosen so that any two codes differ by at least five bits. */ { ulong *table = private->hashCodes; /* reduces typing */ table['0'] = 0xFFC0; table['5'] = 0xF492; table['1'] = 0xFE07; table['6'] = 0xF31E; table['2'] = 0xF98B; table['7'] = 0xF2D9; table['3'] = 0xF84C; table['8'] = 0xCF96; table['4'] = 0xF555; table['9'] = 0xCE51; table['A'] = 0xC9DD; table['N'] = 0xA245; table['B'] = 0xC81A; table['O'] = 0x9F0A; table['C'] = 0xC503; table['P'] = 0x9ECD; table['D'] = 0xC4C4; table['Q'] = 0x9941; table['E'] = 0xC348; table['R'] = 0x9886; table['F'] = 0xC28F; table['S'] = 0x959F; table['G'] = 0xAF5C; table['T'] = 0x9458; table['H'] = 0xAE9B; table['U'] = 0x93D4; table['I'] = 0xA917; table['V'] = 0x9213; table['J'] = 0xA8D0; table['W'] = 0x6DD3; table['K'] = 0xA5C9; table['X'] = 0x6C14; table['L'] = 0xA40E; table['Y'] = 0x6B98; table['M'] = 0xA382; table['Z'] = 0x6A5F; table['a'] = 0x6746; table['n'] = 0x3C88; table['b'] = 0x6681; table['o'] = 0x3B04; table['c'] = 0x610D; table['p'] = 0x3AC3; table['d'] = 0x60CA; table['q'] = 0x37DA; table['e'] = 0x5D85; table['r'] = 0x361D; table['f'] = 0x5C42; table['s'] = 0x3191; table['g'] = 0x5BCE; table['t'] = 0x3056; table['h'] = 0x5A09; table['u'] = 0x0D19; table['i'] = 0x5710; table['v'] = 0x0CDE; table['j'] = 0x56D7; table['w'] = 0x0B52; table['k'] = 0x515B; table['x'] = 0x0A95; table['l'] = 0x509C; table['y'] = 0x078C; table['m'] = 0x3D4F; table['z'] = 0x064B; } /* Determine the number of bytes needed to store each occurrence position. */ if ( textLength <= 0x10000L ) private->posBytes = POS_2_BYTES; else if ( textLength <= 0x1000000L ) private->posBytes = POS_3_BYTES; else private->posBytes = POS_4_BYTES; /* Set up variables to handle allocation of private storage. */ alloc = (uchar *)&private->heap; /* Parse the text */ textPos = textToSearch; textEnd = textPos + textLength; while ( textPos != textEnd ) { /* Search for start of word */ while ( private->hashCodes[*textPos] == 0 ) { textPos++; if ( textPos == textEnd ) return -1; /* parse all text */ } wordStart = textPos; /* Search for end of word; generate hash value too */ hash = 0; while ( textPos != textEnd && (code = private->hashCodes[ *textPos ]) != 0 ) { hash = (hash << 1) ^ code; textPos++; } wordLength = textPos - wordStart; hash = (hash & 0xFFFFFF) | (wordLength << 24); /* Record the occurrence. First we see if a Word struct exists for this word and whether we need to allocate a new Occurrence struct. */ word = LookupWord( private, (char *)textToSearch, (char *)wordStart, wordLength, hash ); if ( word ) { long allocateNewBlock, blockSize, i, pos; /* This word has occurred before, so it already has a Word struct. See if there's room in the last Occurrence block for another entry. Remember that entry #0 in the Occurrence block is always in use, so we can start checking at entry #1 for a non-zero entry. */ occur = word->last; allocateNewBlock = TRUE; switch ( private->posBytes ) { case POS_2_BYTES: blockSize = 6; break; case POS_3_BYTES: blockSize = 4; break; case POS_4_BYTES: blockSize = 3; break; } for ( i = 1; i < blockSize; i++ ) { GET_POS( pos, occur, i, private->posBytes ) if ( pos == 0 ) { SET_POS( wordStart - textToSearch, occur, i, private->posBytes ) allocateNewBlock = FALSE; break; } } if ( allocateNewBlock ) { /* Block is full. Allocate new Occurrence block */ occur = (Occurrence *) alloc; alloc += ALLOC_SIZE( sizeof(Occurrence) ); if ( alloc >= endPrivateStorage ) return wordStart-textToSearch; /* out of memory */ /* Init the new struct and link it to the end of the occurence list. */ SET_POS( wordStart - textToSearch, occur, 0, private->posBytes ) word->last->next = occur; word->last = occur; } } else { long i; /* This is a new word. Allocate a new Word struct, which contains an Occurrence struct too. */ word = (Word *) alloc; alloc += ALLOC_SIZE( sizeof(Word) ); if ( alloc >= endPrivateStorage ) return wordStart-textToSearch ; /* out of memory */ /* Link it to the start of the Word list, coming off the hash table. */ word->next = private->hashTable[ hash & HASH_MASK ]; private->hashTable[ hash & HASH_MASK ] = word; /* Init the Word struct */ word->hash = hash; word->last = &word->first; /* Init the Occurrence struct */ SET_POS( wordStart - textToSearch, &word->first, 0, private->posBytes ) } } /* Finished parsing text */ return -1; } /* ------------------------------------------------------ */ /* FindWordOccurrence */ long FindWordOccurrence ( char *wordToFind, long wordLength, long occurrenceToFind, char *textToSearch, long textLength, void *privateStorage, long storageSize ) { Private *private = privateStorage; Word *word; ulong hash; /* Make occurenceToFind zero-based */ occurrenceToFind--; /* Generate hash value for word to find */ hash = 0; { long remain = wordLength; uchar *p = (uchar *) wordToFind; while ( remain > 0 ) { hash = (hash << 1) ^ private->hashCodes[*p++]; remain--; } hash = (hash & 0xFFFFFF) | (wordLength << 24); } /* Look for word/occurrence in hash table */ word = LookupWord( private, textToSearch, wordToFind, wordLength, hash ); if ( word ) { Occurrence *occur = &word->first; long blockSize, pos, i; /* Word exists in hash table, so go down the occurrence list. */ switch ( private->posBytes ) { case POS_2_BYTES: blockSize = 6; break; case POS_3_BYTES: blockSize = 4; break; case POS_4_BYTES: blockSize = 3; break; } while ( occur && occurrenceToFind >= blockSize ) { occurrenceToFind -= blockSize; occur = occur->next; } if ( occur ) { GET_POS( pos, occur, occurrenceToFind, private->posBytes ) if ( occurrenceToFind == 0 || pos != 0 ) return pos; occurrenceToFind -= blockSize; } occur = word->last; for ( i = 0; i < blockSize; i++ ) { GET_POS( pos, occur, i, private->posBytes ) if ( pos == 0 ) occurrenceToFind++; } } /* Not in parsed text, so check the unparsed text */ if ( private->endParsedText != -1 ) { char *p; if ( wordLength > 3 ) p = BMH_Search( private->hashCodes, wordToFind, wordLength, occurrenceToFind, textToSearch + private->endParsedText, textToSearch + textLength, private->nullChar ); else p = SimpleSearch( private->hashCodes, wordToFind, wordLength, occurrenceToFind, textToSearch + private->endParsedText, textToSearch + textLength ); if (p) return (p - textToSearch); } /* Not found */ return -1; } /* ------------------------------------------------------ */ /* LookupWord */ /* Look up a word in the hash table */ static Word *LookupWord ( Private *private, char *textToSearch, char *wordText, long wordLength, ulong hash ) { Word *word = private->hashTable[ hash & HASH_MASK ]; while ( word ) { if ( word->hash == hash ) { char *w1, *w2; long pos, remain = wordLength; /* The hash values match, so compare characters to make sure it's the right word. We already know the word length is correct since the length is contained in the upper eight bits of the hash value. */ GET_POS( pos, &word->first, 0, private->posBytes ) w1 = textToSearch + pos; w2 = wordText; while ( remain-- > 0 && *w1++ == *w2++ ) ; if ( remain == -1 ) return word; } word = word->next; } return NULL; } /* ------------------------------------------------------ */ /* PickNullChar */ /* Find a character that doesn't appear anywhere in the unparsed text. BMH_Search() is faster if such a character can be found. */ static char PickNullChar ( Private *private, uchar *textStart, uchar *textEnd ) { long i; uchar *p, occurs[ ALPHABET_SIZE ]; for ( i = 0; i < ALPHABET_SIZE; i++ ) occurs[i] = FALSE; for ( p = textStart; p < textEnd; p++ ) occurs[*p] = TRUE; for ( i = 0; i < ALPHABET_SIZE; i++ ) if ( occurs[i] == FALSE && private->hashCodes[i] == 0 ) return i; return NO_NULL_CHAR; } /* ------------------------------------------------------ */ /* BMH_Search */ /* Search the unparsed text using the Boyer-Moore-Horspool algorithm. Ideally a null character is supplied (one that appears in neither the search string nor the text being searched). This allows the inner loop to be faster. */ static char *BMH_Search ( ulong *hashCodes, /* private->hashCodes */ char *wordToFind, long wordLength, long occurrenceToFind, /* 0 is first occurrence */ char *textStart, /* start of unparsed text */ char *textEnd, /* end of unparsed text */ char nullChar /* private->nullChar */ ) { long i; char *text, *wordEnd; char word[256]; long offset[ ALPHABET_SIZE ]; /* Copy the search string to a private buffer, where the first character is the null character. */ word[0] = nullChar; for ( i = 0; i < wordLength; i++ ) word[i+1] = wordToFind[i]; /* Set up the offset[] lookup table */ for ( i = 0; i < ALPHABET_SIZE; i++ ) offset[i] = wordLength; for ( i = 1; i < wordLength; i++ ) offset[ word[i] ] = wordLength - i; /* Let the search begin... */ wordEnd = word + wordLength; text = textStart + wordLength - 1; if ( nullChar == NO_NULL_CHAR ) { /* No null character, so use a slower inner loop */ while ( text < textEnd ) { long i; char *p, *q; for ( i = wordLength, p = wordEnd, q = text; i > 0 && *p == *q; i--, p--, q-- ) ; /* If i == 0, we have found the search string. Now we make sure that it is delimited. */ if ( i == 0 && hashCodes[*q] == 0 && (text+1 == textEnd || hashCodes[text[1]] == 0) ) { if ( occurrenceToFind == 0 ) return q+1; occurrenceToFind--; } text += offset[*text]; } } else { /* There is a null character (usual case), so we can use a faster and simpler inner loop. */ while ( text < textEnd ) { char *p, *q; for ( p = wordEnd, q = text; *p == *q; p--, q-- ) ; if ( p == word && hashCodes[*q] == 0 && (text+1 == textEnd || hashCodes[text[1]] == 0) ) { if ( occurrenceToFind == 0 ) return q+1; occurrenceToFind--; } text += offset[*text]; } } return NULL; } /* ------------------------------------------------------ */ /* SimpleSearch */ /* Search the unparsed text using a simple search algorithm. Note that wordLength must be 1, 2, or 3. This algorithm runs faster than BMH_Search() for small search strings. */ static char *SimpleSearch( ulong *hashCodes, /* private->hashCodes */ char *wordToFind, long wordLength, /* 1..3 */ long occurrenceToFind, /* 0 is 1st occurrence */ char *textStart, /* start of unparsed text */ char *textEnd /* end of all text */ ) { char *text, first; first = wordToFind[0]; text = textStart; if ( wordLength == 1 ) { while ( text < textEnd ) { while ( text < textEnd && *text != first ) text++; if ( hashCodes[*(text-1)] == 0 && hashCodes[text[wordLength]] == 0 ) { if ( occurrenceToFind == 0 ) return text; occurrenceToFind--; } text++; } } else if ( wordLength == 2 ) { while ( text < textEnd ) { while ( text < textEnd && *text != first ) text++; if ( text[1] == wordToFind[1] && hashCodes[*(text-1)] == 0 && hashCodes[text[wordLength]] == 0 ) { if ( occurrenceToFind == 0 ) return text; occurrenceToFind--; } text++; } } else /* wordLength == 3 */ { while ( text < textEnd ) { while ( text < textEnd && *text != first ) text++; if ( text[1] == wordToFind[1] && text[2] == wordToFind[2] && hashCodes[*(text-1)] == 0 && hashCodes[text[wordLength]] == 0 ) { if ( occurrenceToFind == 0 ) return text; occurrenceToFind--; } text++; } } return NULL; }